# import library
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from plotly import graph_objs as go
# import dataset
df = pd.read_csv (r"D:\Pasca Wisuda\Data analyst\My project portfolio\Dataset\chocolate_bars.csv")
df
| id | manufacturer | company_location | year_reviewed | bean_origin | bar_name | cocoa_percent | num_ingredients | ingredients | review | rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2454 | 5150 | U.S.A. | 2019 | Tanzania | Kokoa Kamili, batch 1 | 76.0 | 3.0 | B,S,C | rich cocoa, fatty, bready | 3.25 |
| 1 | 2458 | 5150 | U.S.A. | 2019 | Dominican Republic | Zorzal, batch 1 | 76.0 | 3.0 | B,S,C | cocoa, vegetal, savory | 3.50 |
| 2 | 2454 | 5150 | U.S.A. | 2019 | Madagascar | Bejofo Estate, batch 1 | 76.0 | 3.0 | B,S,C | cocoa, blackberry, full body | 3.75 |
| 3 | 2542 | 5150 | U.S.A. | 2021 | Fiji | Matasawalevu, batch 1 | 68.0 | 3.0 | B,S,C | chewy, off, rubbery | 3.00 |
| 4 | 2546 | 5150 | U.S.A. | 2021 | Venezuela | Sur del Lago, batch 1 | 72.0 | 3.0 | B,S,C | fatty, earthy, moss, nutty,chalky | 3.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2525 | 1205 | Zotter | Austria | 2014 | Blend | Raw | 80.0 | 4.0 | B,S*,C,Sa | waxy, cloying, vegetal | 2.75 |
| 2526 | 1996 | Zotter | Austria | 2017 | Colombia | APROCAFA, Acandi | 75.0 | 3.0 | B,S,C | strong nutty, marshmallow | 3.75 |
| 2527 | 2036 | Zotter | Austria | 2018 | Blend | Dry Aged, 30 yr Anniversary bar | 75.0 | 3.0 | B,S,C | fatty, earthy, cocoa | 3.00 |
| 2528 | 2170 | Zotter | Austria | 2018 | Congo | Mountains of the Moon | 70.0 | 3.0 | B,S,C | fatty, mild nuts, mild fruit | 3.25 |
| 2529 | 2170 | Zotter | Austria | 2018 | Belize | Maya Mtn | 72.0 | 3.0 | B,S,C | muted, roasty, accessible | 3.50 |
2530 rows × 11 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2530 entries, 0 to 2529 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 2530 non-null int64 1 manufacturer 2530 non-null object 2 company_location 2530 non-null object 3 year_reviewed 2530 non-null int64 4 bean_origin 2530 non-null object 5 bar_name 2530 non-null object 6 cocoa_percent 2530 non-null float64 7 num_ingredients 2443 non-null float64 8 ingredients 2443 non-null object 9 review 2530 non-null object 10 rating 2530 non-null float64 dtypes: float64(3), int64(2), object(6) memory usage: 217.6+ KB
# change data type of num_ingredients to not a decimal number (Int64) which allow missing values
df['num_ingredients']=df['num_ingredients'].astype('Int64')
# change data type of cocoa_percent to not decimal number
df['cocoa_percent']=df['cocoa_percent'].astype('int')
# df info after changing the data type
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2530 entries, 0 to 2529 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 2530 non-null int64 1 manufacturer 2530 non-null object 2 company_location 2530 non-null object 3 year_reviewed 2530 non-null int64 4 bean_origin 2530 non-null object 5 bar_name 2530 non-null object 6 cocoa_percent 2530 non-null int32 7 num_ingredients 2443 non-null Int64 8 ingredients 2443 non-null object 9 review 2530 non-null object 10 rating 2530 non-null float64 dtypes: Int64(1), float64(1), int32(1), int64(2), object(6) memory usage: 210.1+ KB
# description for numeric data
df.describe()
| id | year_reviewed | cocoa_percent | num_ingredients | rating | |
|---|---|---|---|---|---|
| count | 2530.000000 | 2530.000000 | 2530.000000 | 2443.0 | 2530.000000 |
| mean | 1429.800791 | 2014.374308 | 71.637945 | 3.041343 | 3.196344 |
| std | 757.648556 | 3.968267 | 5.616953 | 0.913728 | 0.445321 |
| min | 5.000000 | 2006.000000 | 42.000000 | 1.0 | 1.000000 |
| 25% | 802.000000 | 2012.000000 | 70.000000 | 2.0 | 3.000000 |
| 50% | 1454.000000 | 2015.000000 | 70.000000 | 3.0 | 3.250000 |
| 75% | 2079.000000 | 2018.000000 | 74.000000 | 4.0 | 3.500000 |
| max | 2712.000000 | 2021.000000 | 100.000000 | 6.0 | 4.000000 |
# description for object data
df.describe(include='object')
| manufacturer | company_location | bean_origin | bar_name | ingredients | review | |
|---|---|---|---|---|---|---|
| count | 2530 | 2530 | 2530 | 2530 | 2443 | 2530 |
| unique | 580 | 67 | 62 | 1605 | 21 | 2487 |
| top | Soma | U.S.A. | Venezuela | Madagascar | B,S,C | spicy, cocoa |
| freq | 56 | 1136 | 253 | 55 | 999 | 4 |
# count how many duplicate data
df.duplicated().sum()
0
# count how many unique values
df.nunique()
id 630 manufacturer 580 company_location 67 year_reviewed 16 bean_origin 62 bar_name 1605 cocoa_percent 42 num_ingredients 6 ingredients 21 review 2487 rating 12 dtype: int64
# count how many missing values
df.isnull().sum()
id 0 manufacturer 0 company_location 0 year_reviewed 0 bean_origin 0 bar_name 0 cocoa_percent 0 num_ingredients 87 ingredients 87 review 0 rating 0 dtype: int64
# look unique values for column with missing values
df['num_ingredients'].unique()
<IntegerArray> [3, 4, 2, 5, 6, <NA>, 1] Length: 7, dtype: Int64
# look uniques values for column with missing values
df['ingredients'].unique()
array(['B,S,C', 'B,S,C,L', 'B,S', 'B,S,C,V', 'B,S,C,V,L', 'B,S,C,V,L,Sa',
'B,S,C,V,Sa', nan, 'B,S,V,L', 'B,S*', 'B', 'B,S*,C', 'B,S,L',
'B,S,V', 'B,S*,C,L', 'B,S*,C,Sa', 'B,S*,Sa', 'B,S,C,Sa',
'B,S*,V,L', 'B,C', 'B,S*,C,V', 'B,S,C,L,Sa'], dtype=object)
df.columns
Index(['id', 'manufacturer', 'company_location', 'year_reviewed',
'bean_origin', 'bar_name', 'cocoa_percent', 'num_ingredients',
'ingredients', 'review', 'rating'],
dtype='object')
# histogram for numeric data
df.hist(grid=False, layout=(5,4), figsize=(20,15), bins=15)
plt.show()
# top 10 manufacturers with highest number
df['manufacturer'].value_counts().head(10)
manufacturer Soma 56 Fresco 39 Arete 32 Bonnat 30 A. Morin 26 Dandelion 25 Pralus 25 Domori 23 Valrhona 22 Guittard 22 Name: count, dtype: int64
# Bar plot visualization for top 10 chocolate bar manufacturer
mnf = df['manufacturer'].value_counts().head(10).plot(kind='bar', color='#45B39D', figsize = (6,4),
title='Top 10 Chocolate Bar Manufacturer')
mnf.set_xlabel('Manufacturer')
mnf.set_ylabel('Count')
plt.show
<function matplotlib.pyplot.show(close=None, block=None)>
# top 10 highest company locations
df['company_location'].value_counts().head(10)
company_location U.S.A. 1136 Canada 177 France 176 U.K. 133 Italy 78 Belgium 63 Ecuador 58 Australia 53 Switzerland 44 Germany 42 Name: count, dtype: int64
# top 10 bean origin
df['bean_origin'].value_counts().head(10)
bean_origin Venezuela 253 Peru 244 Dominican Republic 226 Ecuador 219 Madagascar 177 Blend 156 Nicaragua 100 Bolivia 80 Tanzania 79 Colombia 79 Name: count, dtype: int64
# count of rating for all the samples
df['rating'].value_counts()
rating 3.50 565 3.00 523 3.25 464 2.75 333 3.75 300 2.50 166 4.00 112 2.00 33 2.25 17 1.50 10 1.00 4 1.75 3 Name: count, dtype: int64
# scatter plot for rating vs number of ingredients
sct = sns.scatterplot(x='rating', y='num_ingredients', data=df)
sct.set_title('Rating vs Number of Ingredients')
plt.show()
# make correlation for numeric data only
df_corr=df.dropna().corr(method='pearson', numeric_only=True)
df_corr
| id | year_reviewed | cocoa_percent | num_ingredients | rating | |
|---|---|---|---|---|---|
| id | 1.000000 | 0.993091 | 0.047619 | -0.356845 | 0.085569 |
| year_reviewed | 0.993091 | 1.000000 | 0.048085 | -0.362956 | 0.087774 |
| cocoa_percent | 0.047619 | 0.048085 | 1.000000 | -0.187792 | -0.076925 |
| num_ingredients | -0.356845 | -0.362956 | -0.187792 | 1.000000 | -0.098834 |
| rating | 0.085569 | 0.087774 | -0.076925 | -0.098834 | 1.000000 |
# heatmap to show correlation for numeric data only
heatmap_corr = px.imshow(df_corr, color_continuous_scale = 'YlGnBu', text_auto='.2f', aspect= 'auto',
title='Correlation Between All numeric columns')
heatmap_corr.show()
# make a copy of dataframe
df2 = df.copy()
# numerized all the columns to get the correlation for each columns
df_numerized = df2
for col_name in df_numerized.columns:
if (df_numerized[col_name].dtype=='object'):
df_numerized[col_name] = df_numerized[col_name].astype('category')
df_numerized[col_name] = df_numerized[col_name].cat.codes
df_numerized
| id | manufacturer | company_location | year_reviewed | bean_origin | bar_name | cocoa_percent | num_ingredients | ingredients | review | rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2454 | 0 | 62 | 2019 | 52 | 750 | 76 | 3 | 10 | 1679 | 3.25 |
| 1 | 2458 | 0 | 62 | 2019 | 13 | 1597 | 76 | 3 | 10 | 319 | 3.50 |
| 2 | 2454 | 0 | 62 | 2019 | 28 | 162 | 76 | 3 | 10 | 288 | 3.75 |
| 3 | 2542 | 0 | 62 | 2021 | 16 | 935 | 68 | 3 | 10 | 229 | 3.00 |
| 4 | 2546 | 0 | 62 | 2021 | 60 | 1418 | 72 | 3 | 10 | 742 | 3.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2525 | 1205 | 573 | 3 | 2014 | 2 | 1249 | 80 | 4 | 6 | 2452 | 2.75 |
| 2526 | 1996 | 573 | 3 | 2017 | 8 | 11 | 75 | 3 | 10 | 2220 | 3.75 |
| 2527 | 2036 | 573 | 3 | 2018 | 2 | 487 | 75 | 3 | 10 | 741 | 3.00 |
| 2528 | 2170 | 573 | 3 | 2018 | 9 | 1008 | 70 | 3 | 10 | 756 | 3.25 |
| 2529 | 2170 | 573 | 3 | 2018 | 1 | 953 | 72 | 3 | 10 | 1396 | 3.50 |
2530 rows × 11 columns
# correlation table for all the data
df_corr2=df_numerized.dropna().corr(method='pearson')
df_corr2
| id | manufacturer | company_location | year_reviewed | bean_origin | bar_name | cocoa_percent | num_ingredients | ingredients | review | rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 1.000000 | 0.020080 | 0.126790 | 0.993091 | 0.058748 | 0.037507 | 0.047619 | -0.356845 | -0.278015 | -0.035936 | 0.085569 |
| manufacturer | 0.020080 | 1.000000 | -0.089814 | 0.018340 | -0.029630 | 0.001810 | 0.037788 | 0.050261 | 0.030292 | 0.021144 | -0.025883 |
| company_location | 0.126790 | -0.089814 | 1.000000 | 0.135263 | 0.080599 | 0.028114 | 0.029851 | -0.216424 | -0.199825 | -0.016504 | -0.073725 |
| year_reviewed | 0.993091 | 0.018340 | 0.135263 | 1.000000 | 0.056912 | 0.035722 | 0.048085 | -0.362956 | -0.281374 | -0.034769 | 0.087774 |
| bean_origin | 0.058748 | -0.029630 | 0.080599 | 0.056912 | 1.000000 | 0.190436 | -0.009190 | -0.031379 | -0.006710 | -0.027607 | 0.037417 |
| bar_name | 0.037507 | 0.001810 | 0.028114 | 0.035722 | 0.190436 | 1.000000 | -0.026203 | -0.015410 | -0.018399 | -0.014992 | 0.000376 |
| cocoa_percent | 0.047619 | 0.037788 | 0.029851 | 0.048085 | -0.009190 | -0.026203 | 1.000000 | -0.187792 | -0.142607 | -0.097574 | -0.076925 |
| num_ingredients | -0.356845 | 0.050261 | -0.216424 | -0.362956 | -0.031379 | -0.015410 | -0.187792 | 1.000000 | 0.892510 | 0.032839 | -0.098834 |
| ingredients | -0.278015 | 0.030292 | -0.199825 | -0.281374 | -0.006710 | -0.018399 | -0.142607 | 0.892510 | 1.000000 | 0.025908 | -0.067373 |
| review | -0.035936 | 0.021144 | -0.016504 | -0.034769 | -0.027607 | -0.014992 | -0.097574 | 0.032839 | 0.025908 | 1.000000 | -0.095686 |
| rating | 0.085569 | -0.025883 | -0.073725 | 0.087774 | 0.037417 | 0.000376 | -0.076925 | -0.098834 | -0.067373 | -0.095686 | 1.000000 |
# heatmap of correlation between all the data
heatmap_corr2 = px.imshow(df_corr2, color_continuous_scale = 'YlGnBu', text_auto='.2f', aspect= 'auto',
title='Correlation Between All the columns')
heatmap_corr2.show()
year vs id, ingredients vs num_ingredients are columns that have high positive correlation each other.
Questions
# Question 1. What bean origin and company of chocolate generate high rating?
high_rating_by_origin = df.groupby(['bean_origin'])[['rating','company_location']].max().sort_values(by='rating', ascending=False).reset_index()
high_rating_by_origin
| bean_origin | rating | company_location | |
|---|---|---|---|
| 0 | Mexico | 4.00 | U.S.A. |
| 1 | Papua New Guinea | 4.00 | U.S.A. |
| 2 | Haiti | 4.00 | U.S.A. |
| 3 | Indonesia | 4.00 | U.S.A. |
| 4 | Jamaica | 4.00 | U.S.A. |
| ... | ... | ... | ... |
| 57 | Sri Lanka | 3.00 | Sweden |
| 58 | Sierra Leone | 3.00 | U.S.A. |
| 59 | Principe | 2.75 | Hungary |
| 60 | St.Vincent-Grenadines | 2.75 | St.Vincent-Grenadines |
| 61 | Martinique | 2.75 | Martinique |
62 rows × 3 columns
Show all the bean origin with highest rating and company location. There are 62 origins with the highest rating (4.00) from Mexico, Papua New Guinea, Haiti, Indonesia, Jamaica, etc.
# create bar chart to visualize highest rating by origin
bar1 = px.bar(high_rating_by_origin, x='bean_origin', y='rating', color='company_location',
color_discrete_sequence= ['#40E0D0', '#CD5C5C', '#1A5276','#6495ED', '#CCCCFF', '#FF5733', '#8E44AD',
'#D68910', '#0B5345', '#839192', '#F7DC6F', '#BFC9CA', px.colors.qualitative.Antique[0],
px.colors.qualitative.Set3[4], px.colors.qualitative.Set3[9], px.colors.qualitative.T10[7],
px.colors.qualitative.D3[0], px.colors.qualitative.Set3[6], px.colors.qualitative.T10[1],
px.colors.qualitative.Set3[0]],
title='Highest Rating by Bean Origin', height=700,
labels={'bean_origin': 'Bean Origin', 'rating': 'Highest Rating', 'company_location':'Company Location'})
bar1.show()
Bar chart showed that the highest rating chocolate bar by bean origin mostly produced from U.S.A company. Other company locations which produced 4.00 rating are U.K, France, Venezuela, Vietnam, and Wales.
#color_continuous_scale = 'YlGnBu'
fig = px.colors.sequential.swatches_continuous()
#fig.show()
#color_discrete_sequence= [px.colors.qualitative.Pastel2[0],px.colors.qualitative.Pastel2[1]]
fig = px.colors.qualitative.swatches()
#fig.show()
# Question 2. What are the characteristics of the highest (4.00) rating chocolate bar?
# show detail values for rating = 4.00
highest_rating = df[df['rating']==4.00]
highest_rating_clean = highest_rating.drop(columns= ['id', 'company_location', 'year_reviewed', 'bar_name']).sort_values(by='cocoa_percent', ascending=False).reset_index(drop=True).rename_axis("Index", axis=0)
highest_rating_clean
| manufacturer | bean_origin | cocoa_percent | num_ingredients | ingredients | review | rating | |
|---|---|---|---|---|---|---|---|
| Index | |||||||
| 0 | Soma | Blend | 88 | 3 | B,S,C | pronounced berry,mild smoke | 4.0 |
| 1 | Scharffen Berger | Peru | 78 | 4 | B,S,C,L | banana, pear, spice, cheese | 4.0 |
| 2 | Bonnat | Venezuela | 75 | 3 | B,S,C | creamy, fatty, floral | 4.0 |
| 3 | Pralus | Sao Tome | 75 | 4 | B,S,C,L | large nibs, spicy, cocoa | 4.0 |
| 4 | Bonnat | Blend | 75 | 3 | B,S,C | creamy, rich, complex | 4.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 107 | Valrhona | Madagascar | 64 | 5 | B,S,C,V,L | creamy, blueberry, raspberry | 4.0 |
| 108 | A. Morin | Peru | 63 | 3 | B,S,C | sweet, cocoa, tangerine | 4.0 |
| 109 | Valrhona | Peru | 63 | 5 | B,S,C,V,L | intense, dark berry, grapes | 4.0 |
| 110 | Madecasse (Cinagra) | Madagascar | 63 | 5 | B,S,C,V,L | intense, nutty, cocoa | 4.0 |
| 111 | AMMA | Brazil | 60 | 4 | B,S,C,L | creamy, sweet,cocoa,banana | 4.0 |
112 rows × 7 columns
There are 112 chocolate bars with 4.00 rating
# Characteristics of 4.00 rating chocolate bar
df_characteristics = highest_rating['review'].str.split(',',expand=True).stack().value_counts().reset_index()
df_characteristics.columns = ['Characteristics', 'Count']
df_characteristics
| Characteristics | Count | |
|---|---|---|
| 0 | creamy | 34 |
| 1 | cocoa | 21 |
| 2 | nutty | 15 |
| 3 | spicy | 6 |
| 4 | complex | 5 |
| ... | ... | ... |
| 155 | cherry | 1 |
| 156 | mild strawberry | 1 |
| 157 | molasses | 1 |
| 158 | bright fruit | 1 |
| 159 | cinamon | 1 |
160 rows × 2 columns
There are 160 characteristics of chocolate bar which produced 4.00 rating
# bar chart visualization of characteristics of 4.00 rating chocolate bar
bar2 = px.bar(df_characteristics, x='Characteristics', y='Count', color = 'Count', color_continuous_scale = 'Sunset',
title='Characteristics of Highest Rating Chocolate Bar', height=700,
labels={'Characteristics': 'Characteristics', 'Count': 'Count'})
bar2.show()
The most characteristics of 4.00 rating chocolate bar are creamy, cocoa, and nutty with 34, 21, 15 amount.
# Question 3. What characteristics of chocolate bar with highest rating (4.00) based on ingredient?
# Show Ingredients data which produced 4.00 rating
highest_rating_ingredients = highest_rating_clean['ingredients'].value_counts().reset_index()
highest_rating_ingredients.columns = ['Ingredients', 'Count']
highest_rating_ingredients
| Ingredients | Count | |
|---|---|---|
| 0 | B,S,C | 58 |
| 1 | B,S | 26 |
| 2 | B,S,C,L | 14 |
| 3 | B,S,C,V,L | 11 |
| 4 | B,S,C,V | 2 |
# Visualized Ingredients produced 4.00 rating
bar3 = px.bar(highest_rating_ingredients, x='Ingredients', y='Count', color = 'Count', color_continuous_scale = 'Sunset',
title='Ingredients of Highest Rating Chocolate Bar', height=500, width = 600,
labels={'Ingredients': 'Ingredients', 'Count': 'Count'})
bar3.show()
Bean, Sugar, and Cocoa Butter is the most combination ingredients used that produced 4.00 rating.
Legend : B = Bean S = Sugar C = Cocoa Butter L = Lecithin V = Vanilla
# Try to extract all of characteristics based on ingredients with 4.00 rating
# Extract characteristics for B,S ingredients with 4.00 rating
BS = highest_rating_clean[highest_rating_clean['ingredients']=='B,S']
BS_char = BS['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BS_char.columns = ['Characteristics', 'BS_count']
# Extract characteristics for B,S,C ingredients with 4.00 rating
BSC = highest_rating_clean[highest_rating_clean['ingredients']=='B,S,C']
BSC_char = BSC['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BSC_char.columns = ['Characteristics', 'BSC_count']
# Extract characteristics for B,S,C,L ingredients with 4.00 rating
BSCL = highest_rating_clean[highest_rating_clean['ingredients']=='B,S,C,L']
BSCL_char = BSCL['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BSCL_char.columns = ['Characteristics', 'BSCL_count']
# Extract characteristics for B,S,C,V,L ingredients with 4.00 rating
BSCVL = highest_rating_clean[highest_rating_clean['ingredients']=='B,S,C,V,L']
BSCVL_char = BSCVL['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BSCVL_char.columns = ['Characteristics', 'BSCVL_count']
# Extract characteristics for B,S,C,V ingredients with 4.00 rating
BSCV = highest_rating_clean[highest_rating_clean['ingredients']=='B,S,C,V']
BSCV_char = BSCV['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BSCV_char.columns = ['Characteristics', 'BSCV_count']
# Merge ectracted data above become 1 dataframe
from functools import reduce
chars = [BS_char, BSC_char, BSCL_char, BSCVL_char, BSCV_char]
df_merged = reduce(lambda left,right: pd.merge(left, right, on=['Characteristics'], how='outer'), chars)
df_merged
| Characteristics | BS_count | BSC_count | BSCL_count | BSCVL_count | BSCV_count | |
|---|---|---|---|---|---|---|
| 0 | creamy | 9.0 | 18.0 | 4.0 | 3.0 | NaN |
| 1 | cocoa | 4.0 | 11.0 | 3.0 | 3.0 | NaN |
| 2 | sticky | 3.0 | NaN | NaN | NaN | NaN |
| 3 | smooth | 2.0 | NaN | NaN | NaN | NaN |
| 4 | dried fruit | 2.0 | 2.0 | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 154 | green | NaN | NaN | NaN | 1.0 | NaN |
| 155 | tart citrus | NaN | NaN | NaN | 1.0 | NaN |
| 156 | long and rich | NaN | NaN | NaN | 1.0 | NaN |
| 157 | strong spice | NaN | NaN | NaN | NaN | 1.0 |
| 158 | intense pepper | NaN | NaN | NaN | NaN | 1.0 |
159 rows × 6 columns
# Stacked bar chart to visualize characteristics of all ingredients having 4.00 rating
fig5 = go.Figure(data=[
go.Bar(name='B,S (Bean,Sugar)', x=df_merged.Characteristics, y=df_merged.BS_count),
go.Bar(name='B,S,C (Bean,Sugar,Cocoa Butter)', x=df_merged.Characteristics, y=df_merged.BSC_count),
go.Bar(name='B,S,C,L (Bean,Sugar,Cocoa Butter,Lecithin)', x=df_merged.Characteristics, y=df_merged.BSCL_count),
go.Bar(name='B,S,C,V,L (Bean,Sugar,Cocoa Butter,Vanilla,Lecithin)', x=df_merged.Characteristics, y=df_merged.BSCVL_count),
go.Bar(name='B,S,C,V (Bean,Sugar,Cocoa,Vanilla)', x=df_merged.Characteristics, y=df_merged.BSCV_count)
])
fig5.update_layout(barmode='stack', height=600, width=1000, plot_bgcolor='#FFFFFF',
title='Characteristics of Highest Rating Chocolate Bar by Ingredients',
xaxis_title= 'Characteristics',
yaxis_title= 'Count',
legend_title='Ingredients',
legend= dict(x=0.55, y=1.0, bgcolor='rgba(255, 255, 255, 0)',
bordercolor='rgba(255, 255, 255, 0)'),
)
fig5.show()
Stacked bar chart showed the composition each characterisctics based on ingredients
# characteristics in chocolate bar of B,S ingredients with 4.00 rating
BSR = highest_rating_clean[highest_rating_clean['ingredients']=='B,S']
BSR_char = BSR['review'].str.split(',',expand=True).stack().value_counts().reset_index()
BSR_char.columns = ['Characteristics', 'BS_count']
#BSR_char
BSR2 = BSR_char.to_string(index=False).strip()
BSR2
'Characteristics BS_count\n creamy 9\n cocoa 4\n sticky 3\n smooth 2\n dried fruit 2\n coffee 2\n nutty 2\n cherry 2\n complex 2\n tart 2\n tart 1\n woody 1\n choco 1\n sticky 1\n peanut butter 1\n sweet spice 1\n floral 1\n banana 1\n yogurt 1\n nutty 1\n fruit 1\n mint 1\n cherry 1\n olive 1\n balanced 1\n smoke 1\n red berry 1\n nuts 1\n tobacco 1\n dark berry 1\n balanced 1\n astringent 1\n banana 1\n roasty 1\n oily 1\n leather 1\n well defined 1\n creamy 1\n dried fruit 1\n fig 1\n marshmallow 1\n bright fruit 1\n distinquished 1\n cardamon 1\ndistinct choco and graham 1\n chocolate and grapes 1\n rounded 1\n orange 1\n tropical 1\n long lasting 1\n perfectly balanced roast 1'
# word cloud for BS chocolate bar ingredient with 4.00 rating
word_cloud = WordCloud(width = 600, height = 300, background_color = '#FBEEE6', max_words=51, relative_scaling = 0, min_font_size = 10)
word_cloud.generate(BSR2)
plt.imshow(word_cloud, interpolation = 'bilinear')
plt.axis('off')
plt.show()
# Question 4. What is the detailed information about outstanding, highly recommended, recommended, disappointing, and unpleasant flavor chocolate bar?
# create new column to describe the rating, and drop some unused column
df_rating = df.copy()
#df_rating['flavor']=df_rating['rating']
df_rating.loc[df_rating['rating'] >= 1 , 'flavor'] = 'unpleasant'
df_rating.loc[df_rating['rating'] >= 2 , 'flavor'] = 'disappointing'
df_rating.loc[df_rating['rating'] >= 3 , 'flavor'] = 'recommended'
df_rating.loc[df_rating['rating'] >= 3.5 , 'flavor'] = 'highly recommended'
df_rating.loc[df_rating['rating'] >= 4 , 'flavor'] = 'outstanding'
df_rating = df_rating.drop(columns= ['id']).sort_values(by='rating', ascending=False)
df_rating
| manufacturer | company_location | year_reviewed | bean_origin | bar_name | cocoa_percent | num_ingredients | ingredients | review | rating | flavor | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1829 | Patric | U.S.A. | 2013 | Peru | Piura, Choc. Garage Exclusive | 67 | 3 | B,S,C | creamy, cocoa, grapes | 4.0 | outstanding |
| 575 | Chokola | U.S.A. | 2020 | Madagascar | Bejofo, 2019 H., Batch 20 | 67 | 2 | B,S | cherry, perfectly balanced roast | 4.0 | outstanding |
| 150 | Arete | U.S.A. | 2018 | Colombia | Tumaco | 70 | 2 | B,S | smooth, nutty, cocoa | 4.0 | outstanding |
| 2389 | Valrhona | France | 2007 | Madagascar | Manjari | 64 | 5 | B,S,C,V,L | creamy, blueberry, raspberry | 4.0 | outstanding |
| 2047 | Scharffen Berger | U.S.A. | 2009 | Ghana | Asante | 65 | 5 | B,S,C,V,L | simple, delicate cocoa, long | 4.0 | outstanding |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1587 | Middlebury | U.S.A. | 2012 | Blend | Houseblend | 65 | 4 | B,S*,C,Sa | chemical, salt, wtf | 1.5 | unpleasant |
| 632 | Cote d' Or (Kraft) | Belgium | 2006 | Blend | Sensations Intense | 70 | <NA> | NaN | this is not chocolate | 1.0 | unpleasant |
| 430 | Callebaut | Belgium | 2007 | Ecuador | Baking | 70 | <NA> | NaN | bitter, cocoa | 1.0 | unpleasant |
| 590 | Claudio Corallo | Sao Tome | 2008 | Blend | Principe, Sao Tome & Principe | 100 | <NA> | NaN | chalky, musty, very bitter | 1.0 | unpleasant |
| 1679 | Neuhaus (Callebaut) | Belgium | 2007 | Blend | Dark | 73 | 5 | B,S,C,V,L | pastey, strong off flavor | 1.0 | unpleasant |
2530 rows × 11 columns
df_rating['flavor'].value_counts()
flavor recommended 987 highly recommended 865 disappointing 549 outstanding 112 unpleasant 17 Name: count, dtype: int64
Tabel showed the detail information (include manufacturer, bean_origin, bar_name, cocoa_percent, ingredient, review) about the flavor based on rating of chocolate bar.
# Question 5. What is the detailed information about unpleasant flavor chocolate bar?
df_rating[df_rating['flavor']=='unpleasant']
| manufacturer | company_location | year_reviewed | bean_origin | bar_name | cocoa_percent | num_ingredients | ingredients | review | rating | flavor | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1180 | Hotel Chocolat | U.K. | 2013 | St. Lucia | Island Growers, 2012, 120hr c., batch 13080 | 100 | <NA> | NaN | pastey, bitter, unfixable | 1.75 | unpleasant |
| 158 | Artisan du Chocolat | U.K. | 2009 | Brazil | Brazil Rio Doce | 72 | 3 | B,S,C | cocoa, dominate off note | 1.75 | unpleasant |
| 164 | Artisan du Chocolat | U.K. | 2010 | Venezuela | Venezuela | 100 | <NA> | NaN | very nutty, very bitter | 1.75 | unpleasant |
| 626 | Coppeneur | Germany | 2012 | Ecuador | Ecuador Puristique | 100 | <NA> | NaN | high intensity bitter | 1.50 | unpleasant |
| 2383 | Valrhona | France | 2007 | Blend | Le Noir Extra Amer | 85 | 5 | B,S,C,V,L | creamy, fatty, very bitter | 1.50 | unpleasant |
| 1423 | Machu Picchu Trading Co. | Peru | 2011 | Peru | Peru | 70 | <NA> | NaN | vanilla, strong chemical | 1.50 | unpleasant |
| 2118 | Snake & Butterfly | U.S.A. | 2011 | Ghana | Ghana | 67 | 3 | B,S,C | perfume, strong chemical | 1.50 | unpleasant |
| 426 | Cacaoyere (Ecuatoriana) | Ecuador | 2008 | Ecuador | Pichincha | 91 | 4 | B,S,C,L | klingy, hint of fruit, very bitter | 1.50 | unpleasant |
| 763 | Dolfin (Belcolade) | Belgium | 2006 | Blend | Noir | 70 | 3 | B,S,L | acidic, astringent, unpleasant | 1.50 | unpleasant |
| 323 | Bouga Cacao (Tulicorp) | Ecuador | 2009 | Ecuador | El Oro, Hacienda de Oro | 100 | <NA> | NaN | cardboard, very bitter, floral | 1.50 | unpleasant |
| 2017 | S.A.I.D. | Italy | 2010 | Blend | 100 percent | 100 | <NA> | NaN | sticky, intense, very bitter | 1.50 | unpleasant |
| 293 | Bonnat | France | 2006 | Blend | One Hundred | 100 | <NA> | NaN | acidic, bitter, dry | 1.50 | unpleasant |
| 1587 | Middlebury | U.S.A. | 2012 | Blend | Houseblend | 65 | 4 | B,S*,C,Sa | chemical, salt, wtf | 1.50 | unpleasant |
| 632 | Cote d' Or (Kraft) | Belgium | 2006 | Blend | Sensations Intense | 70 | <NA> | NaN | this is not chocolate | 1.00 | unpleasant |
| 430 | Callebaut | Belgium | 2007 | Ecuador | Baking | 70 | <NA> | NaN | bitter, cocoa | 1.00 | unpleasant |
| 590 | Claudio Corallo | Sao Tome | 2008 | Blend | Principe, Sao Tome & Principe | 100 | <NA> | NaN | chalky, musty, very bitter | 1.00 | unpleasant |
| 1679 | Neuhaus (Callebaut) | Belgium | 2007 | Blend | Dark | 73 | 5 | B,S,C,V,L | pastey, strong off flavor | 1.00 | unpleasant |
Unpleasant flavor is chocolate bar that has rating around 1.0 - 1.9. Based on the table, unpleasant flavor mostly produced from blend bean, eventhough not all of the blend bean produced low rating of chocolate bar. Based on the correlarion matrix, data that has positive correlation with rating is year_reviewed, bean_origin. While manufacturer, company_location, cocoa percent, num_ingredient, ingredient have negative correlation with rating. This correlation number is low, the highest number of correlation is between rating vs year_reviewed (for positive) and num_ingredients (for negative).
Rating vs year_reviewed has high correlation. Based on the table, unpleasant falvor chocolate bars has old year_reviewed, it can be concluded that old year_reviewed can produced low rating chocolate bar
# Question 6. What rating trends by year?
fig_ry = px.scatter(df, x= 'year_reviewed', y='rating', color='year_reviewed', trendline='ols')
fig_ry.update_layout(height=500, width=800,
title='Scatter Plot and Regression of Rating by Year',
xaxis_title= 'Year',
yaxis_title= 'Rating'
)
fig_ry.show()
Scatter plot and regression line showed that rating and year has positive regression although the number is too low. But, ratings rose as the year progressed, there is no chocolate bar has rating below 2.5 in 2021, that means there is no chocolate bar with unpleasant flavor in 2021.
rating_year = df[['year_reviewed','rating']]
rating_year_avg = rating_year.groupby('year_reviewed').mean().reset_index()
figline_ry = px.line (rating_year_avg, x='year_reviewed', y='rating', markers=True)
figline_ry.update_layout(height=500, width=800,
title='Trend of Chocolate Bar Rating by Year',
xaxis_title= 'Year',
yaxis_title= 'Avg Rating'
)
figline_ry.show()
Based on the line chart or this trend, the average rating of chocolate bar show fluctuation by year, but in 2020 and 2021 the rating rose higher than before. The highest rating is produced in 2017 chocolate bar
# Question 7. What is the correlation between percentage of cocoa with rating?
fig_pr = px.scatter(df, x= 'cocoa_percent', y='rating', color='cocoa_percent', trendline='ols')
fig_pr.update_layout(height=500, width=800,
title='Scatter Plot and Regression of Cocoa Percent and Rating',
xaxis_title= 'Cocoa Percent',
yaxis_title= 'Rating'
)
fig_pr.show()
Scatter plot showed that the rating of chocolate bar decreased by higher percentage of cocoa
# Question 8. What is the correlation between bean origin vs rating?
rating_origin = df[['bean_origin','rating']]
rating_origin_avg = rating_origin.groupby('bean_origin').mean().reset_index()
fig_or = px.scatter(rating_origin_avg, x= 'rating', y='bean_origin', color='rating')
fig_or.update_layout(height=500, width=800,
title='Scatter Plot and Regression of Bean Origin and Rating',
xaxis_title= 'Avg Rating',
yaxis_title= 'Bean Origin'
)
fig_or.show()
There are 5 bean origins produced rating below 2.8 and 1 bean origin produced rating higher than 3.6
Average bean origin have rating around 3 - 3.3 produced recommended flavor
# Question 9. What insights can we gain from Indonesia bean origin?
df_indo = df[df['bean_origin']=='Indonesia'].drop(columns='id').sort_values(by='rating', ascending=False)
df_indo
| manufacturer | company_location | year_reviewed | bean_origin | bar_name | cocoa_percent | num_ingredients | ingredients | review | rating | |
|---|---|---|---|---|---|---|---|---|---|---|
| 235 | Benoit Nihant | Belgium | 2011 | Indonesia | Bali, Sukrama Bros. Farm, Melaya, 62hr C | 72 | 3 | B,S,C | smokey, raspberries, cocoa | 4.00 |
| 50 | Akesson's (Pralus) | U.K. | 2011 | Indonesia | Bali (west), Sukrama Family, Melaya area | 75 | 4 | B,S,C,L | hay, nut, berry | 3.75 |
| 2456 | Willie's Cacao | U.K. | 2010 | Indonesia | Java | 69 | 3 | B,S,C | smokey, fruity, sour | 3.75 |
| 772 | Domori | Italy | 2007 | Indonesia | Java, Javablond | 70 | 2 | B,S | creamy, smokey, tobacco | 3.75 |
| 243 | Beschle (Felchlin) | Switzerland | 2010 | Indonesia | Java, Grand Cru | 64 | 3 | B,S,C | ham-like, smokey, banana | 3.50 |
| 922 | Fossa | Singapore | 2021 | Indonesia | Burang, Popayato-Paquat microlot, Goranulo Pro... | 72 | 2 | B,S | dried fruit (fig), black licorice | 3.50 |
| 167 | Artisan du Chocolat | U.K. | 2010 | Indonesia | Bali | 72 | 3 | B,S,C | black pepper and banana | 3.50 |
| 267 | Bittersweet Origins | U.S.A. | 2010 | Indonesia | Bali, Singaraja | 75 | 2 | B,S | dry, cardamom, medicinal | 3.25 |
| 2159 | Soma | Canada | 2013 | Indonesia | Java | 70 | 3 | B,S,C | creamy, smokey, some fruit | 3.25 |
| 917 | Fossa | Singapore | 2018 | Indonesia | Pak Eddy Farm, Yogyakarta Region | 70 | 2 | B,S | sandy, powdery, late sour | 3.00 |
| 1835 | Pierre Marcolini | Belgium | 2006 | Indonesia | Kendem Lembu, Java | 72 | 5 | B,S,C,V,L | red fruit, smokey, sour | 3.00 |
| 1899 | Pralus | France | 2019 | Indonesia | Java | 75 | 4 | B,S,C,L | fruit, woody, ashey, fatty | 3.00 |
| 264 | Bittersweet Origins | U.S.A. | 2010 | Indonesia | Bali, Singaraja | 65 | 2 | B,S | pastey, off spicey, nutty | 2.75 |
| 75 | Amano | U.S.A. | 2008 | Indonesia | Bali, Jembrana | 70 | 4 | B,S,C,V | sticky, nutty, high acidity | 2.75 |
| 1637 | Molucca | U.S.A. | 2015 | Indonesia | Indonesia | 70 | 2 | B,S | intense, smokey, floral | 2.75 |
| 2015 | Ruket | Italy | 2021 | Indonesia | Sumba, lot 02IN | 77 | 2 | B,S | mild smoke, tart, spice, harsh | 2.75 |
| 160 | Artisan du Chocolat | U.K. | 2009 | Indonesia | Java | 72 | 3 | B,S,C | gritty, very smokey, sour | 2.75 |
| 1131 | Heinde & Verre | Netherlands | 2019 | Indonesia | Pulau, Bali | 71 | 4 | B,S,C,L | rubber, fatty, sandy,grape | 2.50 |
| 1589 | Middlebury | U.S.A. | 2012 | Indonesia | Balinese, Java | 70 | 4 | B,S*,C,Sa | leather, salt | 2.50 |
| 2457 | Willie's Cacao | U.K. | 2013 | Indonesia | Java, Indonesian Black | 100 | <NA> | NaN | fruit, strong smoke, bitter | 2.25 |
df_indo['manufacturer'].nunique()
16
df_indo['company_location'].nunique()
9
Indonesia cocoa bean used by 16 manufacturer and 9 company location, with highest rating (4.00) chocolate bar produced from Benoit Nihant, Belgium in 2011 using 72% cocoa, and ingredient of bean, sugar, cocoa butter.
Lowest rating is 2.25, was produced from Willie's Cacao, UK in 2013 with 100% cocoa that has bitter characteristic.
# Question 10. What most ingredient effect rating of chocolate bar?
df_ingredients = df.copy()
IngredientsDics = {
"Beans": "B",
"Sugar": "S",
"Sweetener": "S*",
"Cocoa Butter": "C",
"Vanilla": "V",
"Lecithin": "L",
"Salt": "Sa"
}
# Creating new columns for each ingredient and make it numeric
for key in IngredientsDics:
df_ingredients[key] = np.where(df_ingredients['ingredients'].str.contains(IngredientsDics[key],case=True), 1, 0)
df_ingredients2 = df_ingredients.drop(columns=['cocoa_percent','num_ingredients','id','manufacturer','company_location','year_reviewed','bean_origin','bar_name','ingredients','review']).sort_values(by='rating', ascending=False)
df_ingredients2
| rating | Beans | Sugar | Sweetener | Cocoa Butter | Vanilla | Lecithin | Salt | |
|---|---|---|---|---|---|---|---|---|
| 1829 | 4.0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 |
| 575 | 4.0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
| 150 | 4.0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
| 2389 | 4.0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 |
| 2047 | 4.0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1587 | 1.5 | 1 | 1 | 1 | 1 | 0 | 0 | 1 |
| 632 | 1.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 430 | 1.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 590 | 1.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 1679 | 1.0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 |
2530 rows × 8 columns
# build correlation heatmap to know ingredient that highly correlate with rating
df_ingredients2_corr=df_ingredients2.dropna().corr(method='pearson', numeric_only=True)
df_ingredients2_corr
ing_corr = px.imshow(df_ingredients2_corr, color_continuous_scale = 'YlGnBu', text_auto='.2f', aspect= 'auto',
title='Correlation Between Ingredients and Rating')
ing_corr.show()
Ingredients that have high correlation with rating are vanilla, salt, and lecithin (negative correlation). It can be concluded that vanilla, salt, and lecithin can produced low rating of chocolate bar